import json
import pandas as pd
import plotly.express as px
import os
import plotly.graph_objects as go
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from bertopic import BERTopic
#data="places.json"
# Path to the raw JSON export of places.
data = "dataset/sample_20210501.json"
# Parse the file; `data` is rebound from the path string to the parsed dict.
# (Original export had lost the indentation of the `with` body — restored here.)
with open(data, 'r') as f:
    data = json.load(f)
print(len(data["places"]))
places = data["places"]
df = pd.DataFrame(places)
# NOTE(review): bare expressions like the two below only display in a notebook;
# they are no-ops when this file runs as a plain script.
df["properties"].iloc[0]
df.shape[0]
# How many times does each place_id appear in the export?
df_ids = df.groupby(['place_id']).size().reset_index()
df_ids = df_ids.rename(columns={0: "number_of_times"}).sort_values(by=['number_of_times'], ascending=False)
df_ids
# We are going to separate the elements stored in each tag list into new rows.
# Peek at the raw tag lists before flattening.
df["tags"][0:5]
# One row per (place, tag) pair: explode splits each tag list into separate rows.
df_tags=df.explode('tags')
df_tags
# Count how often each tag occurs across all places.
g_tags=df_tags.groupby(['tags']).size().reset_index()
g_tags=g_tags.rename(columns={0: "number_of_times"}).sort_values(by=['number_of_times'], ascending=False)
g_tags
# Bar chart of tag frequencies (renders in a notebook).
px.histogram(g_tags, x="tags", y="number_of_times", histfunc="sum", color="tags", title='Frequency of tags places')
# Sample of the raw town column.
df["town"][1:10]
# Ignore places with no town before grouping.
df_town=df.dropna(subset=['town'])
# Number of places per town.
town=df_town.groupby(['town']).size().reset_index()
town=town.rename(columns={0: "number_of_times"})
# NOTE(review): drops the row at index 0 — presumably an empty/placeholder town
# name that sorts first; confirm against the actual data.
town=town.drop([0])
town=town.sort_values(by=['number_of_times'], ascending=False)
town
# Bubble chart: marker size and colour both encode the place count per town.
px.scatter(town, x='town', y='number_of_times', color='number_of_times', size="number_of_times", size_max=60, title="Frequency of places grouped by towns")
# Frequency of place names alone.
df_name_town=df.groupby(['name']).size().reset_index()
df_name_town=df_name_town.rename(columns={0: "number_of_times"})
df_name_town=df_name_town.sort_values(by=['number_of_times'], ascending=False)
# NOTE(review): the reset_index() result is discarded (not assigned), and
# df_name_town is rebuilt immediately below — this name-only grouping only
# matters as a notebook display cell.
df_name_town.reset_index()
# Frequency of (name, town) pairs — the same name may repeat across towns.
df_name_town=df.groupby(['name', 'town']).size().reset_index()
df_name_town=df_name_town.rename(columns={0: "number_of_times"})
df_name_town=df_name_town.sort_values(by=['number_of_times'], ascending=False)
df_name_town
# Flatten the nested `properties` dicts into top-level columns.
# NOTE(review): .apply(pd.Series) is slow on large frames; pd.json_normalize
# would do the same flattening faster.
df_properties=pd.concat([df.drop(['properties'], axis=1), df['properties'].apply(pd.Series)], axis=1)
df_properties[0:3]
# Wheelchair-access availability broken down by town.
df_properties_wc=df_properties.groupby(['place.facilities.wheelchair-access', 'town']).size().reset_index()
df_properties_wc=df_properties_wc.rename(columns={0: "number_of_times"})
df_properties_wc=df_properties_wc.sort_values(by=['number_of_times'], ascending=False)
df_properties_wc
# Disabled-toilet availability broken down by town.
df_properties_td=df_properties.groupby(['place.facilities.toilets_disabled', 'town']).size().reset_index()
df_properties_td=df_properties_td.rename(columns={0: "number_of_times"})
df_properties_td=df_properties_td.sort_values(by=['number_of_times'], ascending=False)
df_properties_td
# One row per individual description (a place may carry several).
df_descriptions=df.explode('descriptions')
# Expand each description dict into its own columns.
df_descriptions=pd.concat([df_descriptions.drop(['descriptions'], axis=1), df_descriptions['descriptions'].apply(pd.Series)], axis=1)
# Keep only rows that actually have description text.
df_descriptions=df_descriptions.dropna(subset=['description']).reset_index()
# Corpus for the embedding / topic-model steps below.
documents=df_descriptions["description"].values
len(documents)
import re
from gensim.parsing.preprocessing import remove_stopwords
def clean_documents(text):
    """Return *text* with e-mail addresses, URLs, single quotes and
    (gensim) stopwords stripped out.

    Used to clean each place description before embedding.
    """
    # (Original export had lost the function-body indentation — restored here.)
    text = re.sub(r'\S*@\S*\s?', '', text, flags=re.MULTILINE)  # remove e-mail addresses
    text = re.sub(r'http\S+', '', text, flags=re.MULTILINE)     # remove web addresses
    text = re.sub(r"'", "", text)                               # remove single quotes
    text = remove_stopwords(text)
    return text
# Clean every description up-front; the cleaned texts feed the embedding model.
# (Comprehension replaces the original append loop, whose body indentation the
# notebook export had lost.)
d = [clean_documents(text) for text in documents]
# Sentence-transformer model used to embed each cleaned description.
model = SentenceTransformer('all-MiniLM-L6-v2')
#Training our text_embeddings - using the descriptions available & all-MiniLM-L6-v2 Transformer
text_embeddings = model.encode(d, batch_size = 8, show_progress_bar = True)
# (n_documents, embedding_dim)
np.shape(text_embeddings)
# Pairwise cosine similarity between all description embeddings.
similarities = cosine_similarity(text_embeddings)
# argsort ascending: the last column of each row is the row's self-match
# (similarity 1.0), so the second-to-last column is the most similar *other*
# document. NOTE(review): assumes no exact tie with the self-similarity.
similarities_sorted = similarities.argsort()
# For every document, record its nearest neighbour and the similarity score.
# (Original export had lost the loop-body indentation — restored here; the
# unused local `p = len(array)` was removed.)
id_1 = []
id_2 = []
score = []
for index, array in enumerate(similarities_sorted):
    id_1.append(index)
    id_2.append(array[-2])
    score.append(similarities[index][array[-2]])
index_df = pd.DataFrame({'id_1': id_1,
                         'id_2': id_2,
                         'score': score})
print(index_df)
index_df["score"].sort_values(ascending=False)
# Inspect one particular pairing (notebook display cell).
index_df.iloc[51]
# NOTE: Documents 51 and 52 seem to be the most similar. Let's see what they contain.
# Show the raw text of the two most similar descriptions found above.
documents[51]
documents[52]
# Fit BERTopic on the cleaned texts, reusing the precomputed embeddings so the
# model does not re-encode the corpus.
topic_model = BERTopic(min_topic_size=10).fit(d, text_embeddings)
topics, probs = topic_model.transform(d, text_embeddings)
# Interactive visualisations (render in a notebook).
topic_model.visualize_topics()
topic_model.visualize_barchart()
topic_model.visualize_heatmap()
topic_model.get_topic_freq()